# Read data
spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')
# Check data
dim(spotify_songs)
## [1] 32833 23
# Let's convert into data.table for efficiency reasons
spotify_songs <- as.data.table(spotify_songs)
# Skim the data to get some sense of variable attributes
id_cols <- c('track_id', 'track_album_id', 'playlist_id')
skimr::skim(spotify_songs[, setdiff(colnames(spotify_songs), id_cols)])
| Name | spotify_songs[, setdiff(c… |
| Number of rows | 20 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| data | 0 | 1 | FALSE | 20 | aco: 1, dan: 1, dur: 1, ene: 1 |
I think it was definitely worth skimming the data, since it provide a nice overview of the data. With that in hand, we are able to phrase relevant/interesting questions about the spotify dataset. I am going to classify these questions into two groups:
Enough is said, let’s get to work!
# First check missing data
spotify_songs[is.na(track_name) ]
## track_id track_name track_artist track_popularity
## 1: 69gRFGOWY9OMpFJgFol1u0 <NA> <NA> 0
## 2: 5cjecvX0CmC9gK0Laf5EMQ <NA> <NA> 0
## 3: 5TTzhRSWQS4Yu8xTgAuq6D <NA> <NA> 0
## 4: 3VKFip3OdAvv4OfNTgFWeQ <NA> <NA> 0
## 5: 69gRFGOWY9OMpFJgFol1u0 <NA> <NA> 0
## track_album_id track_album_name track_album_release_date
## 1: 717UG2du6utFe7CdmpuUe3 <NA> 2012-01-05
## 2: 3luHJEPw434tvNbme3SP8M <NA> 2017-12-01
## 3: 3luHJEPw434tvNbme3SP8M <NA> 2017-12-01
## 4: 717UG2du6utFe7CdmpuUe3 <NA> 2012-01-05
## 5: 717UG2du6utFe7CdmpuUe3 <NA> 2012-01-05
## playlist_name playlist_id playlist_genre
## 1: HIP&HOP 5DyJsJZOpMJh34WvUrQzMV rap
## 2: GANGSTA Rap 5GA8GDo7RQC3JEanT81B3g rap
## 3: GANGSTA Rap 5GA8GDo7RQC3JEanT81B3g rap
## 4: Reggaeton viejito<U+0001F525> 0si5tw70PIgPkY1Eva6V8f latin
## 5: latin hip hop 3nH8aytdqNeRbcRCg3dw9q latin
## playlist_subgenre danceability energy key loudness mode speechiness
## 1: southern hip hop 0.714 0.821 6 -7.635 1 0.1760
## 2: gangster rap 0.678 0.659 11 -5.364 0 0.3190
## 3: gangster rap 0.465 0.820 10 -5.907 0 0.3070
## 4: reggaeton 0.675 0.919 11 -6.075 0 0.0366
## 5: latin hip hop 0.714 0.821 6 -7.635 1 0.1760
## acousticness instrumentalness liveness valence tempo duration_ms
## 1: 0.0410 0.00000 0.1160 0.649 95.999 282707
## 2: 0.0534 0.00000 0.5530 0.191 146.153 202235
## 3: 0.0963 0.00000 0.0888 0.505 86.839 206465
## 4: 0.0606 0.00653 0.1030 0.726 97.017 252773
## 5: 0.0410 0.00000 0.1160 0.649 95.999 282707
It seems like we have all the data except for attributes of the artist. This isn’t really surprise because spotify_songs data.table seems to come from a transactional database and thus only the manually adminstered data such as artist name etc.. can be missing. Let’s drop these observations
spotify_songs <- na.omit(spotify_songs)
data_dim <- dim(spotify_songs)
To check upon the granularity of the dataset, look for songs that are present more than once
# First check whether the rows are unique but I suspect it isn't the cause
dim(unique(spotify_songs)) # not it isnt't
## [1] 32828 23
# check song counts
dups <- spotify_songs[, .(song_cnt = .N), by = track_id][song_cnt > 1][order(-song_cnt)]
dups
## track_id song_cnt
## 1: 7BKLCZ1jbUBVqRi2FVlTVw 10
## 2: 14sOS5L36385FJ3OL8hew4 9
## 3: 3eekarcy7kvN4yt5ZFzltW 9
## 4: 6oJ6le65B3SEqPwMRNXWjY 8
## 5: 6wo37KVqFJhtuxPTpLCcfe 8
## ---
## 3161: 3rwdcyPQ37SSsf1loOpux9 2
## 3162: 0rCuRc07y6l1kPYj0JSRg5 2
## 3163: 1nas007nDbzLwDGwvMdz79 2
## 3164: 0jPU39bL0SrCmYc1RwbFkX 2
## 3165: 2sApUgcQQL2m0quuYgq0ec 2
dups_sum <- sum(dups$song_cnt)
# check the one with the most presence
spotify_songs[track_id == '7BKLCZ1jbUBVqRi2FVlTVw'][
,c('track_name', 'track_artist', 'track_album_name', 'playlist_name', 'playlist_id')]
## track_name track_artist track_album_name
## 1: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 2: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 3: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 4: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 5: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 6: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 7: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 8: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 9: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## 10: Closer (feat. Halsey) The Chainsmokers Closer (feat. Halsey)
## playlist_name
## 1: Dance Pop
## 2: Post pop teen
## 3: Electropop Hits 2017-2020
## 4: A Loose Definition of Indie Poptimism
## 5: <U+25E4> Hip Hop Dance Music – Urban – Trap – Breaking Locking Popping Bopping – WOD – World of Dance
## 6: Tropical House Run 190 BPM
## 7: 2020 Hits & 2019 Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
## 8: 2020 Hits & 2019 Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
## 9: 2020 Hits & 2019 Hits – Top Global Tracks <U+0001F525><U+0001F525><U+0001F525>
## 10: 2015 songs
## playlist_id
## 1: 37i9dQZF1DWZQaaqNMbbXa
## 2: 222nc9tKxKhfZ2GBrOpwH3
## 3: 7kyvBmlc1uSqsTL0EuNLrx
## 4: 4ZO0wp9G8FA3X6oYNBzda6
## 5: 0Hr2h94pKN8QAGVAgD6BsD
## 6: 37i9dQZF1DWSTc9FdySHtz
## 7: 4JkkvMpVl4lSioqQjeAL0q
## 8: 4JkkvMpVl4lSioqQjeAL0q
## 9: 4JkkvMpVl4lSioqQjeAL0q
## 10: 6UJw1egIcZVfrBmcKs5uHH
It seems like that a track being present in more playlists is causing this. If this is the case, then track_id and playlist_id together should be the granularity of the data. Let’s prove this:
# which equals
dim(dups)[1] - dups_sum
## [1] -4476
This equals number of records - distinct track ids. This was important to check because we can be quite sure that not the song changed but it was just included in more playlists. We will need to keep this in mind when analyzing the song attributes to not to include the same song multiple times! Next up is to inspect album name lengths.
# create var
spotify_songs[, album_name_length := nchar(track_album_name)]
# viz length distribution
ggplot(spotify_songs, aes(album_name_length)) +
geom_histogram(colour="darkblue", fill="lightblue") +
geom_vline(aes(xintercept=mean(album_name_length)),
color="purple", linetype="dashed", size=1) +
labs(title = "ALbum name char. length distribution") +
theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# check what can you make out of 50 chars
spotify_songs[album_name_length == 50][1:2,track_album_name]
## [1] "All My Friends (feat. Tinashe & Chance the Rapper)"
## [2] "Don't Leave Me Lonely (Purple Disco Machine Remix)"
# check what can you make out of 70 chars
spotify_songs[album_name_length == 70][1:2, track_album_name]
## [1] "Beautiful (feat. Camila Cabello) [Bazzi vs. EDX's Ibiza Sunrise Remix]"
## [2] "South of the Border (feat. Camila Cabello & Cardi B) [Sam Feldt Remix]"
# check what can you make out of 50 chars
spotify_songs[album_name_length == 90][1:2, track_album_name]
## [1] "Dancing Time, the Best of Eastern Nigeria's Afro Rock Exponents 1973-77 (Soundway Records)"
## [2] "Dancing Time, the Best of Eastern Nigeria's Afro Rock Exponents 1973-77 (Soundway Records)"
# check what can you make out of 50 chars
spotify_songs[album_name_length == 110][, track_album_name]
## [1] "Hard Rock Music: Clásicos del Rock, Baladas Heavy, Música Rock Melódico Alternativo de los Anos 60's 70's 80's"
## [2] "Hard Rock Music: Clásicos del Rock, Baladas Heavy, Música Rock Melódico Alternativo de los Anos 60's 70's 80's"
Well, these seem to reasonable names. But what about playlists? They should have rather short and not too specific names. Let’s do the same for playlists.
# create var
spotify_songs[, playlist_name_length := nchar(playlist_name)]
# viz length distribution
ggplot(spotify_songs, aes(playlist_name_length)) +
geom_histogram(colour="darkblue", fill="lightblue") +
geom_vline(aes(xintercept=mean(album_name_length)),
color="purple", linetype="dashed", size=1) +
labs(title = "Playlist name char. number distribution") +
theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# check what can you make out of 50 chars
spotify_songs[playlist_name_length > 50][1, playlist_name]
## [1] "Ultimate Indie Presents... Best Indie Tracks of the 2010s"
# check what can you make out of 70 chars
spotify_songs[playlist_name_length > 100][1, unique(playlist_name)]
## [1] "Techno House 2020 <U+0001F47D> Best Collection <U+0001F47B> Top DJ’s Electronic Music - Deep House - Trance - Tech House - Dance - Electro Pop"
U0001f47d are emoji encodings, so if we want to really analyze playlist name lengths, then we should filter these out. It could be done by R’s base enc2native() but I will just leave it this way because I just wanted to see name with long lenghts.
Check how subgenres relate to genres
spotify_songs[, unique(playlist_subgenre), by=playlist_genre]
## playlist_genre V1
## 1: pop dance pop
## 2: pop post-teen pop
## 3: pop electropop
## 4: pop indie poptimism
## 5: rap hip hop
## 6: rap southern hip hop
## 7: rap gangster rap
## 8: rap trap
## 9: rock album rock
## 10: rock classic rock
## 11: rock permanent wave
## 12: rock hard rock
## 13: latin tropical
## 14: latin latin pop
## 15: latin reggaeton
## 16: latin latin hip hop
## 17: r&b urban contemporary
## 18: r&b hip pop
## 19: r&b new jack swing
## 20: r&b neo soul
## 21: edm electro house
## 22: edm big room
## 23: edm pop edm
## 24: edm progressive electro house
## playlist_genre V1
From now on, the song attributes such as tempo, loudness etc.. will be analyzed. Since songs are duplicated, let’s filter for unique songs.
songs <- unique(spotify_songs, by = 'track_id')
dim(songs)
## [1] 28352 25
Which artists/albums are the most popular? Filter for more than 20 tracks.
# plot an artist ranking
# horizontal
ggplot(songs[, .(avg_popularity = mean(track_popularity),
pop_sd = sd(track_popularity),
track_count = .N
) , by = track_artist][track_count > 20][order(-avg_popularity)][1:10,]
) +
geom_bar( aes(x=reorder(track_artist, avg_popularity), y=avg_popularity), stat="identity", fill="skyblue", alpha=0.5) +
geom_errorbar( aes(x=track_artist,
ymin=avg_popularity-pop_sd,
ymax=avg_popularity+pop_sd), width=0.4, colour="orange", alpha=0.9, size=1.3) +
coord_flip() +
labs(title = 'Top 10 artists', x = 'artists')
The sd range is big due to the small number of observations. Do the same for albums. Here I think gotta be more cautios because I suspect album names to be not unique for different aritsts.
# get best albums by popularity only consider albums w. at least 5 tracks
fav_albums <- songs[, .(avg_popularity = mean(track_popularity),
pop_sd = sd(track_popularity),
track_count = .N
) , by = track_album_id][order(-avg_popularity)][track_count > 5][1:10,]
# merge to get artist names
fav_albums <- merge(fav_albums,
unique(songs, by = c('track_album_id', 'track_album_name', 'track_artist'))
[, c('track_album_id', 'track_album_name', 'track_artist')],
all.x =T, all.y = F, by = 'track_album_id')
# create var to display album name + performer
fav_albums[, artist_album:= do.call(paste, c(.SD, sep = " - ")),
.SDcols=c('track_artist', 'track_album_name')]
ggplot(fav_albums) +
geom_bar( aes(x=reorder(artist_album, avg_popularity), y=avg_popularity), stat="identity",
fill="skyblue", alpha=0.5) +
coord_flip() +
labs(title = 'Top 10 artists', x = 'artists')
I am really interested whether the playlist genres really group the alike songs. I consider alike songs that have their attributes like energy and loudness in a similar range. To do this, let’s apply PCA and cluster the songs and then check whether the clusters capture the genres.
library(caret)
## Loading required package: lattice
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
pca_vars <- colnames(songs)
# extract numeric vars
pca_vars <- pca_vars[!(startsWith(pca_vars, 'track') | startsWith(pca_vars, 'playlist') |
startsWith(pca_vars, 'album'))]
pca_result <- prcomp(songs[, pca_vars, with=F], scale. = T)
(pca_result$rotation)[, seq(1:5)]
## PC1 PC2 PC3 PC4 PC5
## danceability -0.038589823 0.59561843 -0.016516675 0.31435332 0.119081422
## energy 0.613189473 -0.02732757 0.015534194 0.09168641 0.030901600
## key 0.006032126 0.07712829 0.678512348 -0.01394207 -0.075493284
## loudness 0.554175947 0.08058271 -0.039621211 -0.05993675 0.120719682
## mode -0.009993168 -0.13621189 -0.677135004 0.06782140 -0.092004567
## speechiness 0.001006913 0.35511364 0.075934593 -0.45391938 -0.005170522
## acousticness -0.490747777 0.04576793 -0.038233534 -0.23650100 -0.018503291
## instrumentalness -0.072798645 -0.29053929 0.194321709 0.36870109 0.406964990
## liveness 0.162448199 -0.12713977 0.065513510 -0.37241774 -0.572072811
## valence 0.103545428 0.52332705 -0.120740090 0.19591640 -0.217598825
## tempo 0.178891130 -0.22515211 -0.007943339 -0.31801744 0.291021579
## duration_ms -0.003888301 -0.24526149 0.122894873 0.45635483 -0.575176207
fviz_pca(pca_result, label = F, alpha = 0.2)
Well there is a lot of overlap. Lets color the songs (dots) by genre, we might explore patterns
# create a data table from the pca result
pca_songs <- as.data.table(pca_result$x)
# assign rows as ids
pca_songs[, id:=row.names(pca_songs)]
# do the same for the `original` dataset
songs[, id:=row.names(songs)]
ggplot(merge(pca_songs, songs[, c('id', 'playlist_genre')],
all.x = T, all.y = T, by = 'id'),
aes(PC1, PC2, color = playlist_genre)) +
geom_point(alpha = 0.2)
Well, there is a lot of data points and a lot of overlap. If I may try hard enough, I may find some patterns between the genres but just by looking at this graph, it is nearly impossible. We would be better of if we analyzed this problem analytically, for example clustering in this PCA space and calculate for example dissimilarity scores between the clusters. But that would be out of the scope of this assignment.
Let’s make racing bars based on how many songs were made per year per subgenre. This page was really helpful when doings this: link
# create a Year col
songs[, Y := format(as.Date(track_album_release_date), "%Y") ]
# select relevant cols
data_racing <- songs[, c('Y', 'playlist_subgenre', 'track_id')]
data_racing <- na.omit(data_racing) # needed for the ggplot
data_racing <- data_racing[, .(cnt = .N), by = c('Y', 'playlist_subgenre' ) ]
setorderv(data_racing, c('Y', 'cnt'))
# cumulative count
data_racing[, cnt_cum := cumsum(cnt), by = c( 'playlist_subgenre')]
# not necessary col
data_racing[,cnt:=NULL]
data_racing[order(c('YM', 'playlist_subgenre'))]
## Y playlist_subgenre cnt_cum
## 1: 1957 urban contemporary 1
## 2: 1958 classic rock 1
# check if this works as expected
data_racing[playlist_subgenre == 'album rock'][1:8]
## Y playlist_subgenre cnt_cum
## 1: 1963 album rock 1
## 2: 1964 album rock 3
## 3: 1965 album rock 4
## 4: 1966 album rock 7
## 5: 1967 album rock 13
## 6: 1968 album rock 17
## 7: 1969 album rock 28
## 8: 1970 album rock 49
I really prefer pivoting data.frames and not data.tables
library(tidyr)
library(tidyverse)
# only to get the 'full' range of year for example there are years for which there was no tracks released in a genre
data_racing <- pivot_wider(data_racing, names_from = Y, values_from = cnt_cum )
data_racing <- pivot_longer(data_racing, -playlist_subgenre, names_to = "Y", values_to = "album_cumsum")
# forward fill values. If there was no track in 1970 in a genre then it will be imputed with the last value
data_racing <- data_racing %>%
group_by(playlist_subgenre) %>%
fill(album_cumsum)
# shape for ggplot
data_racing <- data_racing %>%
group_by(Y) %>%
arrange(-album_cumsum) %>%
mutate(rank = row_number()) %>%
filter( rank <= 10) %>%
arrange(Y)
library(gganimate)
gen_subgenre <- unique(songs[, c('playlist_genre', 'playlist_subgenre')])
data_racing <- merge(data_racing, gen_subgenre, by = 'playlist_subgenre', all.x = T, all.y = F)
data_racing$Y <- as.numeric(data_racing$Y)
# write_rds(racing_rds, 'racing.rds')
p <- data_racing %>%
ggplot(aes(x=-rank, y=album_cumsum, fill = playlist_genre)) +
geom_tile(aes(y = album_cumsum/2, height=album_cumsum),width=0.9)+
geom_text(aes(label=playlist_subgenre),
hjust="right",
colour="black",
fontface="bold",
nudge_y=-100)+
geom_text(aes(label=scales::comma(album_cumsum)),
hjust="left",
nudge_y=50,
colour="grey30")+
theme_minimal() +
coord_flip(clip="off") +
scale_x_discrete("") +
scale_y_continuous("",labels=scales::comma)+
scale_fill_brewer(palette = 'RdYlBu') +
theme(panel.grid.major.y=element_blank(),
panel.grid.minor.x=element_blank(),
plot.title= element_text(size=20,colour="grey50",face="bold"),
plot.caption = element_text(colour="grey50"),
plot.subtitle = element_text(size=20,colour="grey50",face="bold"),
plot.margin = margin(1,1,1,2,"cm"),
axis.text.y=element_blank())+
transition_time(Y) +
labs(title='Number of tracks released in {round(frame_time,0)}',
caption='made by Gerold')
# I couldn't make this work so in an rmdown so I animated and saved the gif in an R script
# rendering a video would help solving this but would have taken too much time to deal with this
#animate(p, duration = 30, fps = 25, end_pause = 20)
# anim_save("racing-bars.gif")